GPU Computing for Data Scientists

Using CUDA, Jupyter, PyCUDA, ArrayFire and Thrust

https://github.com/QuantScientist/Data-Science-ArrayFire-GPU


In [1]:
%reset -f
import pycuda
from pycuda import compiler
import pycuda.driver as drv
import pycuda.driver as cuda

Make sure we have CUDA


In [3]:
drv.init()
print("%d device(s) found." % drv.Device.count())           
for ordinal in range(drv.Device.count()):
    dev = drv.Device(ordinal)
    print ("Device #%d: %s" % (ordinal, dev.name()))

drv


1 device(s) found.
Device #0: GeForce GTX 1080
Out[3]:
<module 'pycuda.driver' from 'C:\\Anaconda3\\lib\\site-packages\\pycuda\\driver.py'>
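
Beyond the device name, PyCUDA can also report each device's attributes. A minimal sketch (reusing the drv handle from the cell above; not executed here) that prints the compute capability and total memory:

In [ ]:
dev = drv.Device(0)
# Compute capability as a (major, minor) tuple, total device memory in bytes
print("Compute capability: %d.%d" % dev.compute_capability())
print("Total memory: %d MB" % (dev.total_memory() // (1024 * 1024)))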

Simple multiplication on the GPU: compilation


In [5]:
import pycuda.autoinit
import numpy

from pycuda.compiler import SourceModule

srcGPU = """
    #include <stdio.h>
   __global__ void multGPU(float *dest, float *a, float *b)
{
  const int i = threadIdx.x;  
  dest[i] = a[i] * b[i];
  //dest[i] = threadIdx.x + threadIdx.y + blockDim.x;
  //dest[i] = blockDim.x;
  //printf("I am %d.%d\\n", threadIdx.x, threadIdx.y);
  
}
"""

srcGPUModule = SourceModule(srcGPU)

print (srcGPUModule)


<pycuda.compiler.SourceModule object at 0x0000021E2268D710>
C:\Anaconda3\lib\site-packages\ipykernel_launcher.py:19: UserWarning: The CUDA compiler succeeded, but said the following:
kernel.cu
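
The UserWarning above is just PyCUDA echoing nvcc's (harmless) build output for kernel.cu. SourceModule hands the source string to nvcc, and extra compiler flags can be forwarded through its options argument; a hedged sketch (the -use_fast_math flag is only an example):

In [ ]:
# nvcc flags can be forwarded through `options`; -use_fast_math is only an example
fastModule = SourceModule(srcGPU, options=["-use_fast_math"])
print(fastModule.get_function("multGPU"))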

Simple multiplication on the GPU: host memory allocation


In [6]:
ARR_SIZE=16

a = numpy.random.randn(ARR_SIZE).astype(numpy.float32)
a=numpy.ones_like(a)*3
b = numpy.random.randn(ARR_SIZE).astype(numpy.float32)
b=numpy.ones_like(b)*2

dest = numpy.zeros_like(a)
# print dest
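
For a simple element-wise product like this, pycuda.gpuarray also offers a NumPy-like interface that handles allocation and transfers itself. A minimal sketch (the a_dev/b_dev names are just for illustration and are not used in the cells below):

In [ ]:
from pycuda import gpuarray

# Copy the host arrays to the device and multiply element-wise there
a_dev = gpuarray.to_gpu(a)
b_dev = gpuarray.to_gpu(b)
print((a_dev * b_dev).get())   # .get() copies the result back to the host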

Simple multiplication on the GPU: execution


In [9]:
multGPUFunc = srcGPUModule.get_function("multGPU")

print (multGPUFunc)

multGPUFunc(drv.Out(dest), drv.In(a), drv.In(b),
            block=(ARR_SIZE, 1, 1))  # one thread per element; the kernel only uses threadIdx.x
print (dest)


<pycuda._driver.Function object at 0x0000021E226987A0>
[ 6.  6.  6.  6.  6.  6.  6.  6.  6.  6.  6.  6.  6.  6.  6.  6.]
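
Because the kernel indexes only by threadIdx.x, a single block caps the problem size at the device's maximum block dimension (1024 threads on current GPUs). A sketch of how the same kernel could be written with a global index so it scales across a grid of blocks; the multGPUGrid name and the bounds-check parameter n are additions for illustration:

In [ ]:
srcGPUGrid = """
__global__ void multGPUGrid(float *dest, float *a, float *b, int n)
{
  // Global index: block offset plus thread offset within the block
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n)
      dest[i] = a[i] * b[i];
}
"""
gridModule = SourceModule(srcGPUGrid)
multGPUGridFunc = gridModule.get_function("multGPUGrid")

# 256-thread blocks, with as many blocks as needed to cover ARR_SIZE elements
multGPUGridFunc(drv.Out(dest), drv.In(a), drv.In(b), numpy.int32(ARR_SIZE),
                block=(256, 1, 1), grid=((ARR_SIZE + 255) // 256, 1, 1))
print(dest)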

In [10]:
# print "Calculating %d iterations" % (n_iter)
import timeit

rounds = 3
# The kernel indexes by threadIdx.x only, so the elements must live in one block;
# a grid of single-thread blocks would compute dest[0] over and over.
print('pycuda', timeit.timeit(lambda:
                              multGPUFunc(drv.Out(dest), drv.In(a), drv.In(b),
                                          grid=(1, 1, 1),
                                          block=(ARR_SIZE, 1, 1)),
                              number=rounds))
# print dest

# print 'pycuda', timeit.timeit(lambda: 
#                               multGPUFunc(drv.Out(dest), drv.In(a), drv.In(b),                                          
#                                           block=(ARR_SIZE,1,1)), 
#                               number=rounds)

# print dest


print ('npy', timeit.timeit(lambda:a*b , number=rounds))


pycuda 0.009389220357464863
npy 2.1461075102776825e-05
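
Note that timeit here measures the whole round trip: the host-to-device and device-to-host copies done by drv.In/drv.Out plus launch overhead, which is why NumPy wins easily on 16 elements. For timing just the GPU side, CUDA events are the usual tool; a minimal sketch using PyCUDA's Event API:

In [ ]:
start, end = drv.Event(), drv.Event()

start.record()                 # enqueue a start marker on the default stream
multGPUFunc(drv.Out(dest), drv.In(a), drv.In(b), block=(ARR_SIZE, 1, 1))
end.record()                   # enqueue an end marker
end.synchronize()              # wait until everything up to the marker is done
print("GPU time: %.3f ms" % start.time_till(end))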

Threads and Blocks


In [11]:
a = numpy.random.randn(4,4)
a=numpy.ones_like(a)
a = a.astype(numpy.float32)

a_gpu = cuda.mem_alloc(a.nbytes)

cuda.memcpy_htod(a_gpu, a)

mod = SourceModule("""
    #include <stdio.h>
  __global__ void doublify(float *a)
  {
    int idx = threadIdx.x + threadIdx.y*4;
    a[idx] *= 2;
    //printf("I am %d.%d\\n", threadIdx.x, threadIdx.y);
    
    printf("I am %dth thread in threadIdx.x:%d.threadIdx.y:%d  blockIdx.:%d blockIdx.y:%d blockDim.x:%d blockDim.y:%d\\n",(threadIdx.x+threadIdx.y*blockDim.x+(blockIdx.x*blockDim.x*blockDim.y)+(blockIdx.y*blockDim.x*blockDim.y)),threadIdx.x, threadIdx.y,blockIdx.x,blockIdx.y,blockDim.x,blockDim.y);    
  }
  """)
  
func = mod.get_function("doublify")
func(a_gpu, block=(16,1,1))

a_doubled = numpy.empty_like(a)
cuda.memcpy_dtoh(a_doubled, a_gpu)
print (a_doubled)


[[ 2.  2.  2.  2.]
 [ 2.  2.  2.  2.]
 [ 2.  2.  2.  2.]
 [ 2.  2.  2.  2.]]
C:\Anaconda3\lib\site-packages\ipykernel_launcher.py:19: UserWarning: The CUDA compiler succeeded, but said the following:
kernel.cu
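
The indexing idx = threadIdx.x + threadIdx.y*4 is really two-dimensional; the launch above gets away with block=(16,1,1) only because threadIdx.y is then always 0. A sketch that re-runs the same kernel with a 4x4 block so the printf shows both thread coordinates varying (reuses func, a and a_gpu from the cell above):

In [ ]:
# Re-run doublify with a 2D block that matches the idx = x + y*4 indexing
cuda.memcpy_htod(a_gpu, a)      # reset the device array to ones
func(a_gpu, block=(4, 4, 1))
a_doubled = numpy.empty_like(a)
cuda.memcpy_dtoh(a_doubled, a_gpu)
print(a_doubled)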
